library(tidyverse)
library(excessmort)
library(kableExtra)
library(lubridate)
library(MASS)
library(pdftools)biostats620_final_project
Packages Used
Question 1: Population Sizes by Age Group and Sex
dat <- puerto_rico_counts
#marginal counts, we may want to collapse some of these (maybe into 10 yr bins?)
t1 <- dat %>%
group_by(agegroup, sex) %>%
summarize(count = mean(population), .groups = "drop") %>%
pivot_wider(names_from = sex, values_from = count, values_fill = 0)
kable(t1)| agegroup | female | male |
|---|---|---|
| 0-4 | 118886.60 | 124166.89 |
| 5-9 | 128338.20 | 134027.91 |
| 10-14 | 137254.48 | 142834.59 |
| 15-19 | 140546.33 | 145330.13 |
| 20-24 | 136900.96 | 135803.44 |
| 25-29 | 129586.13 | 122670.16 |
| 30-34 | 124736.05 | 114279.74 |
| 35-39 | 123391.99 | 110548.18 |
| 40-44 | 122339.68 | 108089.43 |
| 45-49 | 116511.76 | 102230.66 |
| 50-54 | 111973.38 | 96919.54 |
| 55-59 | 104562.71 | 89331.28 |
| 60-64 | 94151.71 | 80090.52 |
| 65-69 | 82889.24 | 69814.59 |
| 70-74 | 67546.00 | 55648.71 |
| 75-79 | 52283.22 | 41288.09 |
| 80-84 | 35543.43 | 26154.14 |
| 85-Inf | 34932.77 | 21848.33 |
#stratified plots
dat %>% ggplot(aes(date, population)) +
geom_line(aes(color = agegroup)) +
facet_wrap(~sex) +
labs(title = "Population Size by Age Group and Gender",
x = "Year",
y = "Population",
color = "Age Group") weekly_deaths <- dat %>%
filter(year(date) < 2017) %>%
mutate(mmwr_week = epiweek(date),
mmwr_year = epiyear(date)) %>%
group_by(agegroup, sex, mmwr_week, mmwr_year) %>%
summarise(total_deaths = sum(outcome, na.rm = TRUE), population = mean(population)) %>%
ungroup()`summarise()` has grouped output by 'agegroup', 'sex', 'mmwr_week'. You can
override using the `.groups` argument.
#adding a plot to comment on why older folks are sticking around longer
weekly_deaths |>
filter(mmwr_year < 2017) |>
group_by(mmwr_year) %>%
summarize(mortality = mean(total_deaths/population)*1000, year = mmwr_year) %>%
ggplot(aes(x=year, y= mortality)) +
geom_line(color = 'blue') +
labs(title = "Yearly Mortality Rates (per 1,000): 1985-2016",
x = "Year",
y = "Rate")`summarise()` has grouped output by 'mmwr_year'. You can override using the
`.groups` argument.
Considering the population size of Puerto Rico from 1985 to 2022, we see differing trends across age groups. For both males and females, the population size for older age groups is increasing, while younger age groups are decreasing. We see this particularly in the 0-4 age group for both sexes.
When examining the general trend in mortality rates over time, we see a decrease from 1985 to 2016. This could help to explain why we see increased population sizes for older age groups, potentially due to healthier lifestyles and improvements in medical care access and quality.
Question 2: Expected mortality Before 2017
t2 <- weekly_deaths %>%
# looking at trends/ estimate what a typical week looks like across years
group_by(mmwr_week, agegroup, sex) %>%
summarise(mean_deaths = mean(total_deaths),
sd = sd(total_deaths)) %>%
ungroup() `summarise()` has grouped output by 'mmwr_week', 'agegroup'. You can override
using the `.groups` argument.
kable(head(t2, 10))| mmwr_week | agegroup | sex | mean_deaths | sd |
|---|---|---|---|---|
| 1 | 0-4 | female | 6.09375 | 2.8438885 |
| 1 | 0-4 | male | 7.15625 | 4.5302523 |
| 1 | 5-9 | female | 0.50000 | 0.8424235 |
| 1 | 5-9 | male | 0.59375 | 0.8370214 |
| 1 | 10-14 | female | 0.56250 | 0.8775883 |
| 1 | 10-14 | male | 0.90625 | 0.8175248 |
| 1 | 15-19 | female | 0.96875 | 1.2822454 |
| 1 | 15-19 | male | 3.53125 | 1.8136467 |
| 1 | 20-24 | female | 1.40625 | 1.2406911 |
| 1 | 20-24 | male | 6.40625 | 2.3808290 |
# some age groups have similar trends
t2 %>%
ggplot(aes(x = mmwr_week, y = mean_deaths, color = sex)) +
geom_line() +
geom_ribbon(aes(ymin = mean_deaths - sd,
ymax = mean_deaths + sd,
fill = sex), alpha = 0.2, color = NA) +
facet_wrap(~ agegroup, scales = "free_y") +
labs(title = "Trend of Mean Mortality by Age and Gender",
x = "MMWR Week",
y = "Mean Deaths")# creating a df with larger age groups
new_dat <- dat %>%
mutate(mmwr_week = epiweek(date),
mmwr_year = epiyear(date),
agegroup_new = case_when(
agegroup %in% c("0-4", "5-9", "10-14", "15-19", "20-24") ~ "0-24",
agegroup %in% c("25-29", "30-34", "35-39", "40-44", "45-49",
"50-54", "55-59", "60-64") ~ "25-64",
agegroup %in% c("65-69", "70-74", "75-79", "80-84") ~ "65-84",
agegroup == "85-Inf" ~ "85+")) %>%
group_by(agegroup_new, sex, mmwr_week, mmwr_year) %>%
summarise(total_deaths = sum(outcome, na.rm = TRUE), population = mean(population)) %>%
ungroup()`summarise()` has grouped output by 'agegroup_new', 'sex', 'mmwr_week'. You can
override using the `.groups` argument.
t3 <- new_dat %>%
# looking at trends/ estimate what a typical week looks like across years
group_by(mmwr_week, agegroup_new, sex) %>%
summarise(mean_deaths = mean(total_deaths),
sd = sd(total_deaths)) %>%
ungroup() `summarise()` has grouped output by 'mmwr_week', 'agegroup_new'. You can
override using the `.groups` argument.
kable(head(t3, 10))| mmwr_week | agegroup_new | sex | mean_deaths | sd |
|---|---|---|---|---|
| 1 | 0-24 | female | 8.736842 | 4.195778 |
| 1 | 0-24 | male | 17.000000 | 7.128170 |
| 1 | 25-64 | female | 50.631579 | 7.290946 |
| 1 | 25-64 | male | 108.236842 | 18.397171 |
| 1 | 65-84 | female | 113.763158 | 20.674353 |
| 1 | 65-84 | male | 141.921053 | 26.324772 |
| 1 | 85+ | female | 80.078947 | 26.970837 |
| 1 | 85+ | male | 60.921053 | 21.364762 |
| 2 | 0-24 | female | 9.289474 | 5.598783 |
| 2 | 0-24 | male | 18.500000 | 9.146702 |
t3 %>%
ggplot(aes(x = mmwr_week, y = mean_deaths, color = sex)) +
geom_line() +
geom_ribbon(aes(ymin = mean_deaths - sd,
ymax = mean_deaths + sd,
fill = sex), alpha = 0.2, color = NA) +
facet_wrap(~ agegroup_new, scales = "free_y") +
labs(title = "Trend of Mean Mortality by Age and Gender",
x = "MMWR Week",
y = "Mean Deaths")Linear Model:
# pre-2017 data
model_dataset <- new_dat %>%
filter(mmwr_year < 2017) %>%
mutate(rate = total_deaths/population,
age = as.factor(agegroup_new))
# linear model fit on data before 2017
linear <- lm(rate ~ population + mmwr_week + age + sex, data = model_dataset)
summary(linear)
Call:
lm(formula = rate ~ population + mmwr_week + age + sex, data = model_dataset)
Residuals:
Min 1Q Median 3Q Max
-1.630e-03 -2.093e-04 -4.560e-06 1.816e-04 1.846e-03
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.739e-03 3.805e-05 45.71 <2e-16 ***
population -1.258e-08 2.525e-10 -49.83 <2e-16 ***
mmwr_week -2.074e-06 2.001e-07 -10.36 <2e-16 ***
age25-64 1.975e-04 1.186e-05 16.65 <2e-16 ***
age65-84 1.262e-03 2.506e-05 50.35 <2e-16 ***
age85+ 1.067e-03 3.112e-05 34.28 <2e-16 ***
sexmale 4.230e-04 6.321e-06 66.92 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.0003484 on 13353 degrees of freedom
Multiple R-squared: 0.917, Adjusted R-squared: 0.917
F-statistic: 2.46e+04 on 6 and 13353 DF, p-value: < 2.2e-16
# check if there are negative fitted values
perc_negative <- mean(linear$fitted.values < 0)
perc_negative[1] 0.08787425
Table of Expected Mortality and SD of Expected Mortality
t4 <- model_dataset %>%
mutate(fitted = linear$fitted.values) %>%
group_by(mmwr_week, age, sex) %>%
summarise(exp_mortality = mean(fitted),
sd_exp_mortality = sd(fitted))`summarise()` has grouped output by 'mmwr_week', 'age'. You can override using
the `.groups` argument.
kable(head(t4))| mmwr_week | age | sex | exp_mortality | sd_exp_mortality |
|---|---|---|---|---|
| 1 | 0-24 | female | -0.0000486 | 0.0002048 |
| 1 | 0-24 | male | 0.0003213 | 0.0001978 |
| 1 | 25-64 | female | 0.0004649 | 0.0001194 |
| 1 | 25-64 | male | 0.0010539 | 0.0001066 |
| 1 | 65-84 | female | 0.0023168 | 0.0001581 |
| 1 | 65-84 | male | 0.0028658 | 0.0001134 |
## findingoverall mean and sd of mortality
t4 %>%
ungroup() %>%
summarise(mean = mean(exp_mortality),
sd(sd_exp_mortality))# A tibble: 1 × 2
mean `sd(sd_exp_mortality)`
<dbl> <dbl>
1 0.00149 0.0000491
Linear Model Diagnostics
resids <- residuals(linear)
# QQ plot
ggplot(data.frame(residuals = resids), aes(sample = residuals)) +
stat_qq(size = 1.5, color = "darkblue", alpha = 0.6) +
stat_qq_line(color = "orange", size = 1) +
labs(title = "Normal Q-Q Plot of Model Residuals",
x = "Theoretical Quantiles",
y = "Sample Quantiles") +
theme_minimal()Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
# Residuals vs Fitted Plot
model_df <- data.frame(
fitted = fitted(linear),
residuals = residuals(linear)
)
ggplot(model_df, aes(x = fitted, y = residuals)) +
geom_point(alpha = 0.4, color = "darkblue", size = 1.5) +
geom_hline(yintercept = 0, color = "indianred") +
geom_smooth(method = "loess", se = FALSE, color = "orange", linetype = "solid") +
labs(title = "Residuals vs Fitted Values",
x = "Fitted Values",
y = "Residuals") +
theme_minimal()`geom_smooth()` using formula = 'y ~ x'
Fitting a linear model, we see issues such as violations of homoskedasticity (funnel shape) and normality (tails of qq plot). We also see negative predictions for a non-negative outcome (rate).
Log-Linear Model:
glm_dataset <- new_dat %>% filter(mmwr_year < 2017)
log_linear <- glm(total_deaths ~ mmwr_week + agegroup_new + sex,
offset = log(population),
family = poisson,
data = glm_dataset)
summary(log_linear)
Call:
glm(formula = total_deaths ~ mmwr_week + agegroup_new + sex,
family = poisson, data = glm_dataset, offset = log(population))
Deviance Residuals:
Min 1Q Median 3Q Max
-6.9056 -1.4332 -0.1159 1.2604 8.9106
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -9.498e+00 5.171e-03 -1836.76 <2e-16 ***
mmwr_week -9.211e-04 6.973e-05 -13.21 <2e-16 ***
agegroup_new25-64 1.970e+00 5.056e-03 389.72 <2e-16 ***
agegroup_new65-84 3.239e+00 4.910e-03 659.68 <2e-16 ***
agegroup_new85+ 3.326e+00 5.142e-03 646.95 <2e-16 ***
sexmale 4.929e-01 2.124e-03 232.06 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 1128141 on 13359 degrees of freedom
Residual deviance: 49066 on 13354 degrees of freedom
AIC: 125630
Number of Fisher Scoring iterations: 4
#GOF: prob of observing this deviance value given the df
1-pchisq(log_linear$deviance, log_linear$df.residual)[1] 0
We next fit a log-linear model with mmwr_week, age, and sex again as predictors. We again seek to predict weekly death rate, this time modeling total deaths and using an offset equal to population size for a given week, gender, age, and year. Univariate Wald tests show that each coefficient is significantly associated with our outcome of interest, and a comparison of null and residual deviances shows that our model fits significantly better than the intercept-only model (LRT: 1079075 on 5 degrees of freedom).
Goodness of Fit: Under the null, the residual deviance should be distributed as a \(X^2\) random variable with degrees of freedom equal to the model’s residual degrees of freedom, 13354. We observe a residual deviance value of 49066, which naturally will increase with the total sample size unless the model fits extremely well. The probability of observing such a residual deviance value given that the fit model holds is essentially zero, indicating a lack of model fit.
####Assessing for Overdispersion:
plot(log_linear, which = 3)overdisp <- log_linear$deviance/log_linear$df.residual
overdisp[1] 3.67425
There is an upward trend in the scale-location plot for predicted values vs standardized pearson residuals. In a well-fitting model, the ratio of residual deviance to the degrees of freedom should be approximately 1 since the poisson distribution assumes that the mean and variance are exactly equal. In this case, the ratio value is 3.67 indicating that overdispersion is present and that we may be underestimating the standard error of our data.
Negative Binomial Regression:
neg_bin <- glm.nb(total_deaths ~ mmwr_week + agegroup_new + sex + agegroup_new*sex + offset(log(population)),
link = log,
data = glm_dataset)
summary(neg_bin)
Call:
glm.nb(formula = total_deaths ~ mmwr_week + agegroup_new + sex +
agegroup_new * sex + offset(log(population)), data = glm_dataset,
link = log, init.theta = 47.20250994)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.7721 -0.7268 -0.0438 0.6482 8.1872
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -9.6721006 0.0095420 -1013.633 < 2e-16 ***
mmwr_week -0.0008334 0.0001157 -7.202 5.95e-13 ***
agegroup_new25-64 1.8978371 0.0103386 183.567 < 2e-16 ***
agegroup_new65-84 3.4817399 0.0100028 348.078 < 2e-16 ***
agegroup_new85+ 3.6702479 0.0101586 361.294 < 2e-16 ***
sexmale 0.7490751 0.0112316 66.693 < 2e-16 ***
agegroup_new25-64:sexmale 0.1322616 0.0130319 10.149 < 2e-16 ***
agegroup_new65-84:sexmale -0.3412163 0.0127178 -26.830 < 2e-16 ***
agegroup_new85+:sexmale -0.5914260 0.0131053 -45.129 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for Negative Binomial(47.2025) family taken to be 1)
Null deviance: 475292 on 13359 degrees of freedom
Residual deviance: 13764 on 13351 degrees of freedom
AIC: 101244
Number of Fisher Scoring iterations: 1
Theta: 47.20
Std. Err.: 1.02
2 x log-likelihood: -101223.91
1-pchisq(neg_bin$deviance, neg_bin$df.residual)[1] 0.006131741
Comparing this model with the log-linear model on AIC, we conclude that the negative binomial model fits better given its lower AIC value (125630 vs 101244). However, the probability of observing such a residual deviance value given that the fit model holds is still very low (p = 0.0061), indicating a lack of model fit despite improvements. A few reasons for this include that we are not accounting for autocorrelation in our data (since we’re looking at the population of Puerto Rico over time), seasonal effects on mortality rates, and the overall downward trend in mortality rates due to factors like improvements in healthcare.
Table of Expected Mortality and SD of Expected Mortality
t5 <- glm_dataset %>%
mutate(fitted = neg_bin$fitted.values/population) %>%
group_by(mmwr_week, agegroup_new, sex) %>%
summarise(exp_mortality = mean(fitted),
sd_exp_mortality = sd(fitted))`summarise()` has grouped output by 'mmwr_week', 'agegroup_new'. You can
override using the `.groups` argument.
kable(head(t5))| mmwr_week | agegroup_new | sex | exp_mortality | sd_exp_mortality |
|---|---|---|---|---|
| 1 | 0-24 | female | 0.0000630 | 0 |
| 1 | 0-24 | male | 0.0001332 | 0 |
| 1 | 25-64 | female | 0.0004201 | 0 |
| 1 | 25-64 | male | 0.0010141 | 0 |
| 1 | 65-84 | female | 0.0020474 | 0 |
| 1 | 65-84 | male | 0.0030784 | 0 |
t5 %>%
ungroup() %>%
summarise(mean = mean(exp_mortality),
sd(sd_exp_mortality))# A tibble: 1 × 2
mean `sd(sd_exp_mortality)`
<dbl> <dbl>
1 0.00148 6.04e-19
Question 3: Periods with Excess Mortality Before 2018
Linear Model
# Looking only at data before 2018
predict_pre <- new_dat %>%
filter(mmwr_year <= 2018) %>%
arrange(mmwr_year, mmwr_week) %>%
mutate(rate = total_deaths/population,
age = as.factor(agegroup_new)) %>%
group_by(agegroup_new, sex) %>%
mutate(week_index = row_number()) %>%
ungroup()
# getting predictions
predict_pre$predicted_rate <- predict(linear, newdata = predict_pre)
# excess mortality (obs - predicted)
predict_pre <- predict_pre %>%
mutate(predicted_deaths = predicted_rate * population,
excess_mortality = rate - predicted_rate,
excess_deaths = total_deaths - predicted_deaths)
#observed v fitted over time: looking at sex differences
predict_pre %>% ggplot(aes(x = week_index, y = excess_mortality)) +
geom_smooth(aes(color = sex, group = sex), method = "loess", se = FALSE, span = 0.1) +
labs(title = "Excess Mortality over Time with Linear Model: Before 2018",
subtitle = "Differences by Sex",
x = "Week", y = "Excess Mortality",
color = 'Sex') +
theme_minimal()`geom_smooth()` using formula = 'y ~ x'
predict_pre %>% ggplot(aes(x = week_index, y = excess_mortality)) +
geom_smooth(aes(color = agegroup_new, group = agegroup_new), method = "loess", se = FALSE, span = 0.1) +
labs(title = "Excess Mortality over Time with Linear Model: Before 2018",
subtitle = 'Differences by Age Group',
x = "Week", y = "Excess Mortality",
color = 'Age Group') +
theme_minimal()`geom_smooth()` using formula = 'y ~ x'
Negative Binomial Model
# Looking only at data before 2018
predictnb_pre <- new_dat %>%
filter(mmwr_year <= 2018) %>%
arrange(mmwr_year, mmwr_week) %>%
mutate(rate = total_deaths/population,
age = as.factor(agegroup_new)) %>%
group_by(agegroup_new, sex) %>%
mutate(week_index = row_number()) %>%
ungroup()
# use nb model to predict rates for data before 2018
predictnb_pre$predicted_rate <- predict(neg_bin, type = 'response', newdata = predictnb_pre) / predictnb_pre$population
# excess mortality (obs - predicted)
predictnb_pre <- predictnb_pre %>%
mutate(true_rate = total_deaths/population,
excess_mortality = true_rate - predicted_rate)
#observed v fitted over time: looking at sex differences
predictnb_pre %>% ggplot(aes(x = week_index, y = excess_mortality)) +
geom_smooth(aes(color = sex, group = sex), method = "loess", se = FALSE, span = 0.1) +
labs(title = "Excess Mortality over Time with Negtive Binomial Model: Before 2018",
subtitle = "Differences by Sex",
x = "Week", y = "Excess Mortality",
color = 'Sex')+
theme_minimal()`geom_smooth()` using formula = 'y ~ x'
predictnb_pre %>% ggplot(aes(x = week_index, y = excess_mortality)) +
geom_smooth(aes(color = agegroup_new, group = agegroup_new), method = "loess", se = FALSE, span = 0.1) +
labs(title = "Excess Mortality over Time with Negative Binomial Model: Before 2018",
subtitle = 'Differences by Age Group',
x = "Week", y = "Excess Mortality",
color = 'Age Group') +
theme_minimal()`geom_smooth()` using formula = 'y ~ x'
Based on the predictions from the linear and negative binomial models, there appears to be excess mortality in the year 1985, decreasing from a peak outside of the dataset. This is particularly seen in older age demographics. One potential cause of this could have been the 1985 Puerto Rico floods, which triggered the deadliest single landslide ever recorded in North America.
Recomputing Expected Counts
predictnb_pre %>% filter(week_index >= 65) %>%
ggplot(aes(x = week_index, y = excess_mortality)) +
geom_smooth(aes(color = agegroup_new, group = agegroup_new), method = "loess", se = FALSE, span = 0.1) +
labs(title = "Excess Mortality over Time with Negative Binomial Model: Before 2018",
subtitle = 'Removing up to Week 13 of 1986',
x = "Week", y = "Excess Mortality",
color = 'Age Group') +
theme_minimal()`geom_smooth()` using formula = 'y ~ x'
#table of expected counts:
t6 <- predictnb_pre %>% filter(week_index >= 65) %>%
group_by(mmwr_week, agegroup_new, sex) %>%
summarise(exp_mortality = mean(predicted_rate),
sd_exp_mortality = sd(predicted_rate))`summarise()` has grouped output by 'mmwr_week', 'agegroup_new'. You can
override using the `.groups` argument.
kable(head(t6))| mmwr_week | agegroup_new | sex | exp_mortality | sd_exp_mortality |
|---|---|---|---|---|
| 1 | 0-24 | female | 0.0000630 | 0 |
| 1 | 0-24 | male | 0.0001332 | 0 |
| 1 | 25-64 | female | 0.0004201 | 0 |
| 1 | 25-64 | male | 0.0010141 | 0 |
| 1 | 65-84 | female | 0.0020474 | 0 |
| 1 | 65-84 | male | 0.0030784 | 0 |
Question 4: Predictions for 2017-2018
Linear Prediction
# 2017-2018 data used for prediction
predict_lm <- new_dat %>%
filter(mmwr_year >= 2017 & mmwr_year <= 2018) %>%
mutate(rate = total_deaths/population,
age = as.factor(agegroup_new))
# use linear model to predict data 2017 and after
predict_lm$predicted_rate <- predict(linear, newdata = predict_lm)
# excess mortality (obs - predicted)
predict_lm <- predict_lm %>%
mutate(predicted_deaths = predicted_rate * population,
excess_mortality = rate - predicted_rate,
excess_deaths = total_deaths - predicted_deaths)
# observed vs fitted plots
ggplot(predict_lm, aes(x = rate, y = predicted_rate, color = agegroup_new)) +
geom_point(alpha = 0.5) +
geom_abline(intercept = 0, slope = 1, color = "red", size = 0.8) +
labs(title = "Predicted vs. Observed Mortality Rate",
x = "Observed Rate", y = "Predicted Rate",
color = 'Age Group') +
theme_minimal()# plot of excess mortality over weeks
plot_df <- predict_lm %>%
arrange(mmwr_year, mmwr_week) %>%
mutate(week_index = row_number()) %>%
dplyr::select(week_index, rate, predicted_rate, sex, agegroup_new)
# pivot_longer(cols = c("rate", "predicted_rate"),
# names_to = "type",
# values_to = "mortality_rate")
ggplot(plot_df, aes(x = week_index, y = rate)) +
geom_smooth(method = "lm", se = TRUE, linewidth = 1) +
geom_point(aes(x = week_index, y = rate), alpha = 0.6, color = "pink") +
facet_grid(sex ~ agegroup_new) +
labs(title = "Obs. vs. Fitted Mortality Rates",
y = "Obs. Mortality Rate",
x = "Week Index") +
theme_minimal()`geom_smooth()` using formula = 'y ~ x'
# comparing excess mortality by gender
predict_lm %>% ggplot(aes(x = mmwr_week, y = excess_mortality)) +
geom_smooth(aes(color = sex, group = sex), method = "loess", se = FALSE, span = 0.2) +
facet_wrap(~mmwr_year) +
labs(title = "Excess Mortality over Time: Linear Model",
subtitle = "Differences in Excess Mortality by Sex",
x = "Week", y = "Excess Mortality",
color = 'Sex') +
theme_minimal()`geom_smooth()` using formula = 'y ~ x'
# comparing excess mortality by age
predict_lm %>% ggplot(aes(x = mmwr_week, y = excess_mortality)) +
geom_smooth(aes(color = agegroup_new, group = agegroup_new), method = "loess", se = FALSE, span = 0.2) +
facet_wrap(~mmwr_year) +
labs(title = "Excess Mortality over Time: Linear Model",
subtitle = "Differences in Excess Mortality by Age Group",
x = "Week", y = "Excess Mortality",
color = 'Age Group') +
theme_minimal()`geom_smooth()` using formula = 'y ~ x'
In addition to the model violating the linear model assumptions, we can see that the model does not predict well and a better suited model should be used. From the Predicted vs Observed Rates plot, we can see that for the most part, the linear model is overestimating the actual mortality rate.
Looking at excess mortality, we can see that almost all throughout 2017-2018, excess mortality was negative for both males and female, again indicating that the linear model was overestimating mortality rates. However, it is obvious that Hurricane Maria made landfall around weeks 38/39 when excess mortality spiked for both males and females (above 0) showing how the linear model then underestimated the mortality rates given the unexpected natural disaster. There is a difference in excess mortality between males and females thoughtout most of 2017-2018 with male excess mortality being lower, perhaps indicating higher baseline mortality predictions for males.
When looking at age groups, we can tell that excess mortality was negative for the part during 2017-2018 with a spike when Hurricane Maria made landfall. The linear model was especially bad at predicting mortality rates for the 0-24 age groups and there was also no spike in ecess mortality for this age group when Hurricane Maria made Landfall. Same with 25-64 age groups but not as strong.
The hurricane seemed to affect older age groups with both males and females equally affected.
Negative Binomal Prediction:
# 2017-2018 data used for prediction
predict_nb <- new_dat %>%
filter(mmwr_year >= 2017 & mmwr_year <= 2018) %>%
mutate(age = as.factor(agegroup_new))
# use nb model to predict data 2017 and after
predict_nb$predicted_rate <- predict(neg_bin, type = 'response', newdata = predict_nb) / predict_nb$population
# excess mortality (obs - predicted)
predict_nb <- predict_nb %>%
mutate(true_rate = total_deaths/population,
excess_mortality = true_rate - predicted_rate)
# comparing true vs expected predicted rate (excess mortality)
predict_nb <- predict_nb %>%
mutate(mmwr_label = paste0(mmwr_year, "-W", stringr::str_pad(mmwr_week, 2, pad = "0")))
# observed vs fitted plots
ggplot(predict_nb, aes(x = true_rate, y = predicted_rate, color = agegroup_new)) +
geom_point(alpha = 0.5) +
geom_abline(intercept = 0, slope = 1, color = "red", size = 0.8) +
labs(title = "Observed vs Predicted Mortality Rate: Negative Binomial Model",
x = "Observed Rate", y = "Predicted Rate",
color = 'Age Group') +
theme_minimal()Excess Mortality over Time
#observed v fitted over time: looking at sex differences
predict_nb %>% ggplot(aes(x = mmwr_week, y = excess_mortality)) +
geom_smooth(aes(color = sex, group = sex), method = "loess", se = FALSE, span = 0.2) +
facet_wrap(~mmwr_year) +
labs(title = "Excess Mortality over Time: Negative Binomial Model",
subtitle = "Differences in Excess Mortality by Sex",
x = "Week", y = "Excess Mortality",
color = 'Sex') +
theme_minimal()`geom_smooth()` using formula = 'y ~ x'
#observed v fitted over time: looking at age group differences
predict_nb %>% ggplot(aes(x = mmwr_week, y = excess_mortality)) +
geom_smooth(aes(color = agegroup_new, group = agegroup_new), method = "loess", se = FALSE, span = 0.2) +
facet_wrap(~mmwr_year) +
labs(title = "Excess Mortality over Time: Negative Binomial Model",
subtitle = "Differences in Excess Mortality by Age Group",
x = "Week", y = "Excess Mortality",
color = 'Age Group') +
theme_minimal()`geom_smooth()` using formula = 'y ~ x'
Question 5: Comparison of NYT Data and excessmort Data
pdf_url <- "https://github.com/c2-d2/pr_mort_official/raw/master/data/Mortalidad-RegDem-2015-17-NYT-part1.pdf"
# View(extract_tables(pdf_url, pages = 1)[[1]])
# Download PDF to a temporary file
temp_file <- tempfile(fileext = ".pdf")
download.file(pdf_url, temp_file, mode = "wb")
# extract text from PDF
pdf_text_data <- pdf_text(temp_file)
# view first page of pdf file
# cat(pdf_text_data[1])
extract_table_from_page <- function(page_text) {
lines <- strsplit(page_text, "\n")[[1]]
lines <- trimws(lines)
# Keep lines that start with day numbers (1-31)
table_lines <- grep("^\\d{1,2}\\s", lines, value = TRUE)
# Split on 2+ spaces, assuming table is aligned that way
split_lines <- str_split_fixed(table_lines, "\\s{2,}", 5)
colnames(split_lines) <- c("Day", "Y2015", "Y2016", "Y2017", "Diff")
# Convert to a tibble and clean up
as_tibble(split_lines) %>%
mutate(across(everything(), str_trim)) %>%
mutate(across(-Day, ~ as.numeric(str_replace_all(., "[^0-9\\.-]", ""))))
}
# Apply the extraction function to each page
tables_list <- lapply(pdf_text_data, extract_table_from_page)
# Combine into one data frame
full_table <- bind_rows(tables_list, .id = "Page")
full_table <- full_table %>%
mutate(Month = ifelse(Page == 1, "Sep",
ifelse(Page == 2, "Oct",
ifelse(Page == 3, "Nov",
ifelse(Page == 4, "Dec",
ifelse(Page == 5, "Jan",
ifelse(Page == 6, "Feb",
ifelse(Page == 7, "Mar",
ifelse(Page == 8, "Apr",
ifelse(Page == 9, "May",
ifelse(Page == 10, "Jun",
ifelse(Page == 11, "Jul", "Aug")))))))))))) %>%
dplyr::select(Page, Month, Y2015, Y2016, Y2016, Y2017, Diff) %>%
rownames_to_column("index")
# fixing nov data
full_table <- full_table %>%
filter(!(index %in% c(84:88)))
# fixing dec data
full_table[full_table$Page == 4, "Y2017"] <- 0
full_table <- full_table %>%
filter(!(index %in% c(115)))
# fixing jan data
full_table <- full_table %>%
filter(!(index %in% c(147)))
# fixing feb data
full_table <- full_table %>%
filter(!(index %in% c(179)))
full_table[full_table$index == 190, "Y2015"] <- NA
full_table[full_table$index == 190, "Y2016"] <- 70
# fixing march data
full_table <- full_table %>%
filter(!(index %in% c(209)))
# fixing april data
full_table <- full_table %>%
filter(!(index %in% c(241)))
# fixing may data
full_table <- full_table %>%
filter(!(index %in% c(272)))
# fixing june date
full_table <- full_table %>%
filter(!(index %in% c(306)))
# fixing july date
full_table <- full_table %>%
filter(!(index %in% c(335)))
# fixing aug date
full_table <- full_table %>%
filter(!(index %in% c(367)))
# View(full_table[full_table$Page == 12, ])
full_table <- full_table %>%
mutate(diff = Y2017 - Y2016,
Page = as.numeric(Page),
day = c(c(1:30), c(1:31),
c(1:30), c(1:31),
c(1:31), c(1:29),
c(1:31), c(1:30),
c(1:31), c(1:30),
c(1:31), c(1:31))) %>%
mutate(Month = factor(Month, levels = c("Jan", "Feb", "Mar", "Apr",
"May", "Jun", "Jul", "Aug",
"Sep", "Oct", "Nov", "Dec")),
nyt_diff = diff,
month = Month) %>%
arrange(month) %>%
dplyr:: select(month, day, Y2015, Y2016, Y2017, nyt_diff)
kable(full_table %>% group_by(month) %>% slice(1:3))| month | day | Y2015 | Y2016 | Y2017 | nyt_diff |
|---|---|---|---|---|---|
| Jan | 1 | 107 | 89 | 107 | 18 |
| Jan | 2 | 101 | 88 | 108 | 20 |
| Jan | 3 | 78 | 79 | 115 | 36 |
| Feb | 1 | 66 | 111 | 93 | -18 |
| Feb | 2 | 114 | 95 | 88 | -7 |
| Feb | 3 | 95 | 119 | 68 | -51 |
| Mar | 1 | 82 | 73 | 73 | 0 |
| Mar | 2 | 92 | 69 | 89 | 20 |
| Mar | 3 | 95 | 67 | 57 | -10 |
| Apr | 1 | 70 | 79 | 89 | 10 |
| Apr | 2 | 85 | 53 | 94 | 41 |
| Apr | 3 | 80 | 77 | 81 | 4 |
| May | 1 | 66 | 74 | 62 | -12 |
| May | 2 | 87 | 59 | 93 | 34 |
| May | 3 | 77 | 74 | 73 | -1 |
| Jun | 1 | 68 | 74 | 75 | 1 |
| Jun | 2 | 76 | 71 | 75 | 4 |
| Jun | 3 | 62 | 70 | 84 | 14 |
| Jul | 1 | 71 | 84 | 78 | -6 |
| Jul | 2 | 67 | 75 | 82 | 7 |
| Jul | 3 | 85 | 69 | 80 | 11 |
| Aug | 1 | 69 | 65 | 105 | 40 |
| Aug | 2 | 72 | 79 | 73 | -6 |
| Aug | 3 | 74 | 78 | 88 | 10 |
| Sep | 1 | 75 | 75 | 92 | 17 |
| Sep | 2 | 77 | 67 | 69 | 2 |
| Sep | 3 | 67 | 78 | 78 | 0 |
| Oct | 1 | 74 | 90 | 101 | 11 |
| Oct | 2 | 84 | 67 | 104 | 37 |
| Oct | 3 | 77 | 82 | 118 | 36 |
| Nov | 1 | 77 | 84 | 78 | -6 |
| Nov | 2 | 71 | 72 | 90 | 18 |
| Nov | 3 | 83 | 74 | 68 | -6 |
| Dec | 1 | 82 | 77 | 0 | -77 |
| Dec | 2 | 87 | 88 | 0 | -88 |
| Dec | 3 | 83 | 67 | 0 | -67 |
# comparing nyt data w excessmort data
nyt_check_df <- dat %>%
mutate(year = year(date),
month = month(date, label = TRUE),
day = day(date)) %>%
dplyr::select(day, month, year, outcome) %>%
filter(year >= 2015 & year <= 2017) %>%
group_by(day, month, year) %>%
summarise(total_deaths = sum(outcome)) %>%
pivot_wider(names_from = year, values_from = total_deaths) %>%
mutate(og_diff = `2017`-`2016`,
month = factor(as.character(month), levels = c("Jan", "Feb", "Mar", "Apr",
"May", "Jun", "Jul", "Aug",
"Sep", "Oct", "Nov", "Dec"))) %>%
arrange(month)`summarise()` has grouped output by 'day', 'month'. You can override using the
`.groups` argument.
kable(nyt_check_df %>% group_by(month) %>% slice(1:3)) | day | month | 2015 | 2016 | 2017 | og_diff |
|---|---|---|---|---|---|
| 1 | Jan | 107 | 89 | 107 | 18 |
| 2 | Jan | 101 | 88 | 108 | 20 |
| 3 | Jan | 78 | 79 | 115 | 36 |
| 1 | Feb | 66 | 111 | 93 | -18 |
| 2 | Feb | 114 | 95 | 88 | -7 |
| 3 | Feb | 95 | 119 | 68 | -51 |
| 1 | Mar | 82 | 74 | 73 | -1 |
| 2 | Mar | 92 | 71 | 89 | 18 |
| 3 | Mar | 95 | 67 | 57 | -10 |
| 1 | Apr | 70 | 79 | 89 | 10 |
| 2 | Apr | 85 | 53 | 94 | 41 |
| 3 | Apr | 80 | 77 | 81 | 4 |
| 1 | May | 66 | 74 | 62 | -12 |
| 2 | May | 87 | 59 | 94 | 35 |
| 3 | May | 77 | 74 | 73 | -1 |
| 1 | Jun | 68 | 73 | 75 | 2 |
| 2 | Jun | 76 | 70 | 75 | 5 |
| 3 | Jun | 62 | 70 | 82 | 12 |
| 1 | Jul | 71 | 84 | 78 | -6 |
| 2 | Jul | 67 | 75 | 84 | 9 |
| 3 | Jul | 85 | 69 | 82 | 13 |
| 1 | Aug | 65 | 69 | 106 | 37 |
| 2 | Aug | 79 | 72 | 75 | 3 |
| 3 | Aug | 78 | 74 | 89 | 15 |
| 1 | Sep | 75 | 75 | 95 | 20 |
| 2 | Sep | 77 | 67 | 69 | 2 |
| 3 | Sep | 67 | 78 | 80 | 2 |
| 1 | Oct | 74 | 90 | 104 | 14 |
| 2 | Oct | 85 | 67 | 108 | 41 |
| 3 | Oct | 77 | 82 | 122 | 40 |
| 1 | Nov | 77 | 84 | 84 | 0 |
| 2 | Nov | 71 | 72 | 95 | 23 |
| 3 | Nov | 83 | 74 | 72 | -2 |
| 1 | Dec | 82 | 77 | 91 | 14 |
| 2 | Dec | 87 | 88 | 89 | 1 |
| 3 | Dec | 83 | 68 | 80 | 12 |
# calculating differences between the two datasets
final_comparison <- inner_join(nyt_check_df %>% dplyr::select(day, month, og_diff),
full_table %>% dplyr::select(day, month, nyt_diff),
by = c("day", "month")) %>%
mutate(difference = og_diff - nyt_diff)
final_comparison$week_index <- c(1:nrow(final_comparison))
mean(final_comparison$difference, na.rm = TRUE)[1] 10.89863
# plot of differences
ggplot(final_comparison, aes(x = week_index, y = difference)) +
geom_line(color = "darkblue") +
labs(title = "Difference in Calculated Total Deaths Between NYT and excessmort",
subtitle = "Difference Between 2016-2017 ",
x = "Day Index",
y = "Difference") +
theme_minimal()In certain months there are big differences between the two data sources when comparing the calculated 2016-2017 total death differences per day. These differences mostly stem from differences in counts in August as well as the NYT data source completely missing 2017 death data.